from plotly.offline import init_notebook_mode
init_notebook_mode()
import pandas as pd
df_GeneralData = pd.read_excel(r'src\data\rehospitalization.xlsx', sheet_name='GeneralData')
mean_weight = df_GeneralData['משקל'].mean()
df_GeneralData['משקל'].fillna(mean_weight, inplace=True)
mean_height = df_GeneralData['גובה'].mean()
df_GeneralData['גובה'].fillna(mean_weight, inplace=True)
# Calculate BMI Column
df_GeneralData.loc[df_GeneralData['BMI'].isna(), 'BMI'] = df_GeneralData.apply(
lambda row: row['משקל'] / ((row['גובה'] /100 )**2) if pd.notnull(row['משקל']) and pd.notnull(row['גובה']) else None,
axis=1
)
# Filling empty values in the column in the value "לא ידוע"
df_GeneralData['השכלה'] = df_GeneralData['השכלה'].apply(lambda x: None if isinstance(x, (int, float)) else x)
df_GeneralData['השכלה'].fillna('לא ידוע', inplace=True)
# Convert 'מספר ילדים' to numeric, setting non-numeric values to NaN
df_GeneralData['מספר ילדים'] = pd.to_numeric(df_GeneralData['מספר ילדים'], errors='coerce')
# Calculate the median of the numeric values, ignoring NaN values
median_value = df_GeneralData['מספר ילדים'].median()
# Replace non-numeric (NaN) values with the median
df_GeneralData['מספר ילדים'].fillna(median_value, inplace=True)
marital_status_mapping = {
'לא ידוע': 1,
'רווק': 2,
'נשוי': 3,
'פרוד': 4,
'גרוש': 5,
'אלמן': 6
}
# Apply the mapping to the 'מצב משפחתי' column
df_GeneralData['מצב משפחתי'] = df_GeneralData['מצב משפחתי'].map(marital_status_mapping)
# Save the updated DataFrame to a new Excel file
output_file_path = 'src\data\GeneralData.csv'
df_GeneralData.to_csv(output_file_path, index = False, encoding='utf-8-sig')
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Function to create and customize a histogram plot
def create_plot(df: pd.DataFrame, x_col: str, title: str):
fig1 = px.histogram(df, x=x_col, barmode="group"). \
update_layout(
yaxis_title="Count", # Set the y-axis title
title=title, # Set the plot title
title_font_size=30 # Set the font size for the title
)
fig1.update_xaxes(tickfont_size=20, title=" ")
fig1.update_yaxes(title_font={"size": 20})
fig1.update_traces(opacity=0.9)
fig1.show()
# Load data
data = pd.read_excel(r'src\data\rehospitalization.xlsx', sheet_name='erBeforeHospitalization2')
# Task 14:
# Print the data types of the columns in the dataset
print('Data Types: \n', data.dtypes)
# Print the number of unique patients in the dataset
print('Unique number of patients: ', len(data['Patient'].unique()))
# Calculate the admission time in days for the first hospitalization event
data['admission_time_1'] = (data['Release_Date'] - data['Admission_Entry_Date']).dt.days
# Histograms
create_plot(data, 'admission_time_1', 'Histogram of Admission Time (days)')
create_plot(data, 'מחלקה מאשפזת1', 'Patients by Admission Department')
# Calculate the ER time in hours for the second hospitalization event
data['admission_time_2'] = (data['ev_Release_Time'] - data['ev_Admission_Date']).dt.total_seconds() / 3600
# Histograms
create_plot(data, 'admission_time_2', 'Histogram of ER Time (hours)')
create_plot(data, 'דרך הגעה למיון', 'ER Arrival Histogram')
create_plot(data, 'מיון', 'ER Type Histogram')
Data Types: Patient int64 Admission_Medical_Record int64 Admission_Entry_Date datetime64[ns] Release_Date datetime64[ns] מחלקה מאשפזת1 int64 Medical_Record float64 ev_Admission_Date datetime64[ns] ev_Release_Time datetime64[ns] דרך הגעה למיון object מיון object urgencyLevelTime float64 אבחנות במיון object codeDoctor float64 Admission_Medical_Record2 int64 Admission_Entry_Date2 datetime64[ns] Release_Date2 datetime64[ns] מחלקה מאשפזת2 object dtype: object Unique number of patients: 4515
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Function to create and customize a histogram plot
def create_plot(df: pd.DataFrame, x_col: str, title: str):
fig1 = px.histogram(df, x=x_col, barmode="group"). \
update_layout(
yaxis_title="Count", # Set the y-axis title
title=title, # Set the plot title
title_font_size=30 # Set the font size for the title
)
fig1.update_xaxes(tickfont_size=20, title=" ")
fig1.update_yaxes(title_font={"size": 20})
fig1.update_traces(opacity=0.9)
fig1.show()
# Function to convert a continuous variable into categories based on percentiles
def create_categories_by_precentile(data: pd.DataFrame, groups_num: int):
data['range_days_between_admissions'] = pd.qcut(data['days_between_admissions'], q=groups_num).astype('str') # Create percentile-based bins for the 'days_between_admissions' column
data['range_days_between_admissions'] = data['range_days_between_admissions'].str.slice(start=1, stop=-1)
bins_map = {value: idx for idx, value in enumerate(data['range_days_between_admissions'].unique())}
data['category_days_between_admissions'] = data['range_days_between_admissions'].map(bins_map) # Map the ranges to their corresponding category index
return data
# Load data
data = pd.read_excel(r'src\data\rehospitalization.xlsx', sheet_name='erBeforeHospitalization2')
# Task 16:
# Calculate the number of days between the first and second admissions
data['days_between_admissions'] = (data['Admission_Entry_Date2'] - data['Release_Date']).dt.days
# Histogram
create_plot(data, 'days_between_admissions', 'Histogram of Days Passed Between Admissions')
print("We can see that the days between admissions are from 0 to 29, most of them are 0-7 days.")
# Convert the continuous 'days_between_admissions' into categories based on percentiles
data = create_categories_by_precentile(data, 5)
# Histogram
create_plot(data, 'category_days_between_admissions', 'Histogram of Days Passed Between Admissions by Category')
print("We can see the new distribution for 5 categories by precentile.")
print("The categories are:")
print('Categories created by days:', data['range_days_between_admissions'].unique())
We can see that the days between admissions are from 0 to 29, most of them are 0-7 days.
We can see the new distribution for 5 categories by precentile. The categories are: Categories created by days: ['8.0, 14.0' '21.0, 29.0' '-0.001, 4.0' '4.0, 8.0' '14.0, 21.0']
from scipy.stats import chi2_contingency
# Visualize relationships between categorical variables and re-hospitalization
def create_plot_relationship(df, x_col, title):
fig = px.histogram(df, x=x_col, barmode='group'). \
update_layout(yaxis_title="Number of Recordings",
title=title,
title_font_size=30)
fig.update_xaxes(tickfont_size=20, title=" ")
fig.update_yaxes(title_font={"size": 20})
fig.update_traces(opacity=0.9)
return fig
# Relationship between Education and Rehospitalization
create_plot_relationship(df_GeneralData, 'השכלה', 'Rehospitalization by Education Level')
# Relationship between Number of Children and Rehospitalization
create_plot_relationship(df_GeneralData, 'מספר ילדים', 'Rehospitalization by Number of Children')
# Relationship between Marital Status and Rehospitalization
create_plot_relationship(df_GeneralData, 'מצב משפחתי', 'Rehospitalization by Marital Status')
# Chi-square tests for relationships
def chi_square_test(df, col):
contingency_table = pd.crosstab(df[col],data['days_between_admissions'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square test for {col}: Chi2 = {chi2}, p-value = {p}")
# Apply Chi-square tests
chi_square_test(df_GeneralData, 'השכלה')
chi_square_test(df_GeneralData, 'מספר ילדים')
chi_square_test(df_GeneralData, 'מצב משפחתי')
Chi-square test for השכלה: Chi2 = 18719.003793880023, p-value = 0.3535239781225367 Chi-square test for מספר ילדים: Chi2 = 399.64626372514425, p-value = 0.8868501584051448 Chi-square test for מצב משפחתי: Chi2 = 123.44004841993777, p-value = 0.30085187806882535
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Load data
df_hospitalization1 = pd.read_excel(r'src\data\rehospitalization.xlsx', sheet_name='hospitalization1')
# Convert to datetime if not already
df_hospitalization1['Admission_Entry_Date'] = pd.to_datetime(df_hospitalization1['Admission_Entry_Date'])
df_hospitalization1['Release_Date'] = pd.to_datetime(df_hospitalization1['Release_Date'])
# Filter and group data for each department (1 to 5)
departments = [1, 2, 3, 4, 5]
# Function to create time series plots for admissions and releases
def create_time_series_plot(df, date_col, group_col, title):
monthly_counts = df.groupby([pd.Grouper(key=date_col, freq='M'), group_col]).size().reset_index(name='Count')
# Create time series plot
fig = px.line(monthly_counts, x=date_col, y='Count', color=group_col,
title=title,
labels={date_col: 'Date', 'Count': 'Number of Admissions/Releases'})
fig.update_layout(title_font_size=30)
fig.show()
# Function to perform seasonal decomposition for a given department
def seasonal_decomposition(df, department_number):
# Filter data for the department
department_data = df[df['unitName1'] == department_number]
# Admissions time series
department_admissions = department_data.set_index('Admission_Entry_Date').resample('M').size()
# Decomposition
decomposition = sm.tsa.seasonal_decompose(department_admissions, model='additive')
fig = decomposition.plot()
fig.set_size_inches(14, 7)
plt.suptitle(f'Seasonal Decomposition of Admissions for Department {department_number}', fontsize=16)
plt.show()
# Loop through each department (1 to 5) for time series plots and decomposition
for department in departments:
# Filter data for the department
department_data = df_hospitalization1[df_hospitalization1['unitName1'] == department]
# Time series plots for admissions and releases
create_time_series_plot(department_data, 'Admission_Entry_Date', 'unitName1', f'Admissions Over Time for Department {department}')
create_time_series_plot(department_data, 'Release_Date', 'unitName1', f'Releases Over Time for Department {department}')
# Seasonal decomposition for admissions
seasonal_decomposition(df_hospitalization1, department)
Admissions: The trend in admissions reflects both seasonal fluctuations and potential healthcare trends over time. Strong seasonality suggests that certain departments, particularly those handling acute conditions, are highly influenced by external factors like flu seasons or public health crises.
Releases: Releases generally follow the admissions pattern with some lag, but discrepancies between the two can point to inefficiencies in the hospital system or specific challenges in managing patient flow.
Irregularities: Large residuals in either admissions or releases indicate that there are external factors or one-off events (such as pandemics) that heavily influenced the normal patterns.
In conclusion, understanding the dynamic between admissions and releases is crucial for hospital management, as it enables better planning for resource allocation, staffing, and patient flow optimization, particularly in departments facing seasonal variations or external pressures.